class: center, middle, inverse, title-slide # Databases ### Kirill & Nicolas ### cynkra GmbH ### March 15, 2022 --- <style type="text/css"> .pull-left { margin-top: -25px; } .pull-right { margin-top: -25px; } .remark-code { font-size: 14px; } .font17 { font-size: 17px; } .font14 { font-size: 14px; } </style> --- # Schedule Extract ⇨ Transform ⇨ Load ⇨ Consume .pull-left[ ## One table - Read whole tables - Let the database do the heavy lifting - Basic ETL for one table - Subtle issues to watch out for ] .pull-right[ ## Multiple tables - **Joins** - The {dm} package - A bit of theory - Playing the whole game ] ## Bonus: Bring your own data + questions --- background-image: url("data:image/png;base64,#images/21.webp") background-size: 40% background-position: 100% 100% # Joins .pull-left[ - Usage - Join sources - Mounting Script: `databases_21.R` ```r library(tidyverse) ``` ] --- background-image: url("data:image/png;base64,#images/21-frame.webp") background-size: 40% background-position: 100% 100% # Joins .pull-left[ - Usage - Join sources - Mounting Script: `databases_21.R` ```r library(tidyverse) ``` ] --- # A second table .pull-left[ ```r academy <- tbl(con_duckdb, "academy") academy_sqlite <- tbl(con_sqlite, "academy") academy %>% count(status) ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># Source: lazy query [?? x 2]</span> <span style='color: #949494;'># Database: duckdb_connection</span> <span style='font-weight: bold;'>status</span> <span style='font-weight: bold;'>n</span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #BCBCBC;'>1</span> Award not yet introduced 3 <span style='color: #BCBCBC;'>2</span> Nominated 36 <span style='color: #BCBCBC;'>3</span> Ineligible 23 <span style='color: #BCBCBC;'>4</span> Won Special Achievement 1 <span style='color: #BCBCBC;'>5</span> Won 17 </CODE></PRE> ] .pull-right[ ```r academy ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># Source: table<academy> [?? x 3]</span> <span style='color: #949494;'># Database: duckdb_connection</span> <span style='font-weight: bold;'>film</span> <span style='font-weight: bold;'>award_type</span> <span style='font-weight: bold;'>status</span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #BCBCBC;'> 1</span> Toy Story Animated Feature Award not yet introduced <span style='color: #BCBCBC;'> 2</span> Toy Story Original Screenplay Nominated <span style='color: #BCBCBC;'> 3</span> Toy Story Adapted Screenplay Ineligible <span style='color: #BCBCBC;'> 4</span> Toy Story Original Score Nominated <span style='color: #BCBCBC;'> 5</span> Toy Story Original Song Nominated <span style='color: #BCBCBC;'> 6</span> Toy Story Other Won Special Achievement <span style='color: #BCBCBC;'> 7</span> A Bug's Life Animated Feature Award not yet introduced <span style='color: #BCBCBC;'> 8</span> A Bug's Life Adapted Screenplay Ineligible <span style='color: #BCBCBC;'> 9</span> A Bug's Life Original Score Nominated <span style='color: #BCBCBC;'>10</span> Toy Story 2 Animated Feature Award not yet introduced <span style='color: #949494;'># … with more rows</span> </CODE></PRE> ] --- # Left join The most frequent kind of join. .pull-left[ ## Unsafe ```r academy %>% left_join(pixar_films) ``` <PRE class="fansi fansi-message"><CODE>Joining, by = "film" </CODE></PRE><PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># Source: lazy query [?? x 7]</span> <span style='color: #949494;'># Database: duckdb_connection</span> <span style='font-weight: bold;'>film</span> <span style='font-weight: bold;'>award_type</span> <span style='font-weight: bold;'>status</span> <span style='font-weight: bold;'>number</span> <span style='font-weight: bold;'>release_d…</span> <span style='font-weight: bold;'>run_t…</span> <span style='font-weight: bold;'>film_…</span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><date></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #BCBCBC;'> 1</span> Toy Story Animated Fea… Award… 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 2</span> Toy Story Original Scr… Nomin… 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 3</span> Toy Story Adapted Scre… Ineli… 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 4</span> Toy Story Original Sco… Nomin… 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 5</span> Toy Story Original Song Nomin… 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 6</span> Toy Story Other Won S… 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 7</span> A Bug's Life Animated Fea… Award… 2 1998-11-25 95 G <span style='color: #BCBCBC;'> 8</span> A Bug's Life Adapted Scre… Ineli… 2 1998-11-25 95 G <span style='color: #BCBCBC;'> 9</span> A Bug's Life Original Sco… Nomin… 2 1998-11-25 95 G <span style='color: #BCBCBC;'>10</span> Toy Story 2 Animated Fea… Award… 3 1999-11-24 92 G <span style='color: #949494;'># … with more rows</span> </CODE></PRE> ] .pull-right[ ## Better ```r academy %>% left_join(pixar_films, by = "film") ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># Source: lazy query [?? x 7]</span> <span style='color: #949494;'># Database: duckdb_connection</span> <span style='font-weight: bold;'>film</span> <span style='font-weight: bold;'>award_type</span> <span style='font-weight: bold;'>status</span> <span style='font-weight: bold;'>number</span> <span style='font-weight: bold;'>release_d…</span> <span style='font-weight: bold;'>run_t…</span> <span style='font-weight: bold;'>film_…</span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><date></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #BCBCBC;'> 1</span> Toy Story Animated Fea… Award… 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 2</span> Toy Story Original Scr… Nomin… 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 3</span> Toy Story Adapted Scre… Ineli… 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 4</span> Toy Story Original Sco… Nomin… 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 5</span> Toy Story Original Song Nomin… 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 6</span> Toy Story Other Won S… 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 7</span> A Bug's Life Animated Fea… Award… 2 1998-11-25 95 G <span style='color: #BCBCBC;'> 8</span> A Bug's Life Adapted Scre… Ineli… 2 1998-11-25 95 G <span style='color: #BCBCBC;'> 9</span> A Bug's Life Original Sco… Nomin… 2 1998-11-25 95 G <span style='color: #BCBCBC;'>10</span> Toy Story 2 Animated Fea… Award… 3 1999-11-24 92 G <span style='color: #949494;'># … with more rows</span> </CODE></PRE> ] --- # Left join Computed on the database, original data unchanged. ```r academy %>% left_join(pixar_films, by = "film") %>% show_query() ``` ``` <SQL> SELECT "LHS"."film" AS "film", "award_type", "status", "number", "release_date", "run_time", "film_rating" FROM "academy" AS "LHS" LEFT JOIN "pixar_films" AS "RHS" ON ("LHS"."film" = "RHS"."film") ``` --- # Join with preparation The right-hand side in the join should come from a variable. .pull-left[ ## Prepare RHS ```r academy_won <- academy %>% filter(status == "Won") %>% count(film, name = "n_won") academy_won ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># Source: lazy query [?? x 2]</span> <span style='color: #949494;'># Database: duckdb_connection</span> <span style='font-weight: bold;'>film</span> <span style='font-weight: bold;'>n_won</span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #BCBCBC;'> 1</span> Monsters, Inc. 1 <span style='color: #BCBCBC;'> 2</span> Finding Nemo 1 <span style='color: #BCBCBC;'> 3</span> The Incredibles 2 <span style='color: #BCBCBC;'> 4</span> Ratatouille 1 <span style='color: #BCBCBC;'> 5</span> WALL-E 1 <span style='color: #BCBCBC;'> 6</span> Up 2 <span style='color: #BCBCBC;'> 7</span> Toy Story 3 2 <span style='color: #BCBCBC;'> 8</span> Brave 1 <span style='color: #BCBCBC;'> 9</span> Inside Out 1 <span style='color: #BCBCBC;'>10</span> Coco 2 <span style='color: #949494;'># … with more rows</span> </CODE></PRE> ] .pull-right[ ```r pixar_films %>% left_join(academy_won, by = "film") ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># Source: lazy query [?? x 6]</span> <span style='color: #949494;'># Database: duckdb_connection</span> <span style='font-weight: bold;'>number</span> <span style='font-weight: bold;'>film</span> <span style='font-weight: bold;'>release_date</span> <span style='font-weight: bold;'>run_time</span> <span style='font-weight: bold;'>film_rating</span> <span style='font-weight: bold;'>n_won</span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><date></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #BCBCBC;'> 1</span> 4 Monsters, Inc. 2001-11-02 92 G 1 <span style='color: #BCBCBC;'> 2</span> 5 Finding Nemo 2003-05-30 100 G 1 <span style='color: #BCBCBC;'> 3</span> 6 The Incredibles 2004-11-05 115 PG 2 <span style='color: #BCBCBC;'> 4</span> 8 Ratatouille 2007-06-29 111 G 1 <span style='color: #BCBCBC;'> 5</span> 9 WALL-E 2008-06-27 98 G 1 <span style='color: #BCBCBC;'> 6</span> 10 Up 2009-05-29 96 PG 2 <span style='color: #BCBCBC;'> 7</span> 11 Toy Story 3 2010-06-18 103 G 2 <span style='color: #BCBCBC;'> 8</span> 13 Brave 2012-06-22 93 PG 1 <span style='color: #BCBCBC;'> 9</span> 15 Inside Out 2015-06-19 95 PG 1 <span style='color: #BCBCBC;'>10</span> 19 Coco 2017-11-22 105 PG 2 <span style='color: #949494;'># … with more rows</span> </CODE></PRE> ] --- # Join with postprocessing .pull-left[ ## Raw result ```r pixar_films %>% left_join(academy_won, by = "film") %>% arrange(release_date) ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># Source: lazy query [?? x 6]</span> <span style='color: #949494;'># Database: duckdb_connection</span> <span style='color: #949494;'># Ordered by: release_date</span> <span style='font-weight: bold;'>number</span> <span style='font-weight: bold;'>film</span> <span style='font-weight: bold;'>release_date</span> <span style='font-weight: bold;'>run_time</span> <span style='font-weight: bold;'>film_rating</span> <span style='font-weight: bold;'>n_won</span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><date></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #BCBCBC;'> 1</span> 1 Toy Story 1995-11-22 81 G <span style='color: #BB0000;'>NA</span> <span style='color: #BCBCBC;'> 2</span> 2 A Bug's Life 1998-11-25 95 G <span style='color: #BB0000;'>NA</span> <span style='color: #BCBCBC;'> 3</span> 3 Toy Story 2 1999-11-24 92 G <span style='color: #BB0000;'>NA</span> <span style='color: #BCBCBC;'> 4</span> 4 Monsters, Inc. 2001-11-02 92 G 1 <span style='color: #BCBCBC;'> 5</span> 5 Finding Nemo 2003-05-30 100 G 1 <span style='color: #BCBCBC;'> 6</span> 6 The Incredibles 2004-11-05 115 PG 2 <span style='color: #BCBCBC;'> 7</span> 7 Cars 2006-06-09 117 G <span style='color: #BB0000;'>NA</span> <span style='color: #BCBCBC;'> 8</span> 8 Ratatouille 2007-06-29 111 G 1 <span style='color: #BCBCBC;'> 9</span> 9 WALL-E 2008-06-27 98 G 1 <span style='color: #BCBCBC;'>10</span> 10 Up 2009-05-29 96 PG 2 <span style='color: #949494;'># … with more rows</span> </CODE></PRE> ] .pull-right[ ## After postprocessing ```r pixar_films %>% left_join(academy_won, by = "film") %>% mutate(n_won = coalesce(n_won, 0L)) %>% arrange(release_date) ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># Source: lazy query [?? x 6]</span> <span style='color: #949494;'># Database: duckdb_connection</span> <span style='color: #949494;'># Ordered by: release_date</span> <span style='font-weight: bold;'>number</span> <span style='font-weight: bold;'>film</span> <span style='font-weight: bold;'>release_date</span> <span style='font-weight: bold;'>run_time</span> <span style='font-weight: bold;'>film_rating</span> <span style='font-weight: bold;'>n_won</span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><date></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #BCBCBC;'> 1</span> 1 Toy Story 1995-11-22 81 G 0 <span style='color: #BCBCBC;'> 2</span> 2 A Bug's Life 1998-11-25 95 G 0 <span style='color: #BCBCBC;'> 3</span> 3 Toy Story 2 1999-11-24 92 G 0 <span style='color: #BCBCBC;'> 4</span> 4 Monsters, Inc. 2001-11-02 92 G 1 <span style='color: #BCBCBC;'> 5</span> 5 Finding Nemo 2003-05-30 100 G 1 <span style='color: #BCBCBC;'> 6</span> 6 The Incredibles 2004-11-05 115 PG 2 <span style='color: #BCBCBC;'> 7</span> 7 Cars 2006-06-09 117 G 0 <span style='color: #BCBCBC;'> 8</span> 8 Ratatouille 2007-06-29 111 G 1 <span style='color: #BCBCBC;'> 9</span> 9 WALL-E 2008-06-27 98 G 1 <span style='color: #BCBCBC;'>10</span> 10 Up 2009-05-29 96 PG 2 <span style='color: #949494;'># … with more rows</span> </CODE></PRE> ] --- # Join with processing Computed on the database, original data unchanged. ```r pixar_films %>% left_join(academy_won, by = "film") %>% mutate(n_won = coalesce(n_won, 0L)) %>% arrange(release_date) %>% show_query() ``` ``` <SQL> SELECT "number", "film", "release_date", "run_time", "film_rating", COALESCE("n_won", 0) AS "n_won" FROM (SELECT "number", "LHS"."film" AS "film", "release_date", "run_time", "film_rating", "n_won" FROM "pixar_films" AS "LHS" LEFT JOIN (SELECT "film", COUNT(*) AS "n_won" FROM "academy" WHERE ("status" = 'Won') GROUP BY "film") "RHS" ON ("LHS"."film" = "RHS"."film") ) "q01" ORDER BY "release_date" ``` --- # Tables must be on the same source Use `copy = TRUE` to enforce, the result is a lazy table if the LHS is a lazy table. .pull-left[ ## Bad ```r try( academy %>% left_join(pixar_films_sqlite, by = "film") ) ``` <PRE class="fansi fansi-output"><CODE>Error in auto_copy(x, y, copy = copy, indexes = if (auto_index) list(by$y)) : `x` and `y` must share the same src. <span style='color: #00BBBB;'>ℹ</span> set `copy` = TRUE (may be slow). </CODE></PRE> ] .pull-right[ ## Not too bad ```r academy %>% left_join(pixar_films_sqlite, by = "film", copy = TRUE) ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># Source: lazy query [?? x 7]</span> <span style='color: #949494;'># Database: duckdb_connection</span> <span style='font-weight: bold;'>film</span> <span style='font-weight: bold;'>award_type</span> <span style='font-weight: bold;'>status</span> <span style='font-weight: bold;'>number</span> <span style='font-weight: bold;'>release_d…</span> <span style='font-weight: bold;'>run_t…</span> <span style='font-weight: bold;'>film_…</span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><date></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #BCBCBC;'> 1</span> Toy Story Animated Fea… Award… 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 2</span> Toy Story Original Scr… Nomin… 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 3</span> Toy Story Adapted Scre… Ineli… 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 4</span> Toy Story Original Sco… Nomin… 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 5</span> Toy Story Original Song Nomin… 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 6</span> Toy Story Other Won S… 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 7</span> A Bug's Life Animated Fea… Award… 2 1998-11-25 95 G <span style='color: #BCBCBC;'> 8</span> A Bug's Life Adapted Scre… Ineli… 2 1998-11-25 95 G <span style='color: #BCBCBC;'> 9</span> A Bug's Life Original Sco… Nomin… 2 1998-11-25 95 G <span style='color: #BCBCBC;'>10</span> Toy Story 2 Animated Fea… Award… 3 1999-11-24 92 G <span style='color: #949494;'># … with more rows</span> </CODE></PRE> ] --- # Copying is expensive! A temporary table is created on the LHS database. If the RHS comes from a different database, results are temporarily loaded into the local session! ```r academy %>% left_join(pixar_films_sqlite, by = "film", copy = TRUE) %>% show_query() ``` ``` <SQL> SELECT "LHS"."film" AS "film", "award_type", "status", "number", "release_date", "run_time", "film_rating" FROM "academy" AS "LHS" LEFT JOIN "dbplyr_002" AS "RHS" ON ("LHS"."film" = "RHS"."film") ``` --- # Joining data frames with lazy tables The result is a data frame too. .pull-left[ ## Bad ```r try( pixarfilms::academy %>% left_join(pixar_films, by = "film") ) ``` <PRE class="fansi fansi-output"><CODE>Error in auto_copy(x, y, copy = copy) : `x` and `y` must share the same src. <span style='color: #00BBBB;'>ℹ</span> set `copy` = TRUE (may be slow). </CODE></PRE> ] .pull-right[ ## Could be worse ```r pixarfilms::academy %>% left_join(pixar_films, by = "film", copy = TRUE) ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># A tibble: 80 × 7</span> <span style='font-weight: bold;'>film</span> <span style='font-weight: bold;'>award_type</span> <span style='font-weight: bold;'>status</span> <span style='font-weight: bold;'>number</span> <span style='font-weight: bold;'>release_d…</span> <span style='font-weight: bold;'>run_t…</span> <span style='font-weight: bold;'>film_…</span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><date></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #BCBCBC;'> 1</span> Toy Story Animated Fea… Award… 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 2</span> Toy Story Original Scr… Nomin… 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 3</span> Toy Story Adapted Scre… Ineli… 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 4</span> Toy Story Original Sco… Nomin… 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 5</span> Toy Story Original Song Nomin… 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 6</span> Toy Story Other Won S… 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 7</span> A Bug's Life Animated Fea… Award… 2 1998-11-25 95 G <span style='color: #BCBCBC;'> 8</span> A Bug's Life Adapted Scre… Ineli… 2 1998-11-25 95 G <span style='color: #BCBCBC;'> 9</span> A Bug's Life Original Sco… Nomin… 2 1998-11-25 95 G <span style='color: #BCBCBC;'>10</span> Toy Story 2 Animated Fea… Award… 3 1999-11-24 92 G <span style='color: #949494;'># … with 70 more rows</span> </CODE></PRE> ] --- # DuckDB: register data frames as database tables Temporarily use a local data frame as a table. Also works for Arrow datasets via `duckdb::duckdb_register_arrow()`. .pull-left[ ## Register and access ```r duckdb::duckdb_register( con_duckdb, "academy_small", pixarfilms::academy[1:3, ] ) academy_small <- tbl(con_duckdb, "academy_small") academy_small ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># Source: table<academy_small> [?? x 3]</span> <span style='color: #949494;'># Database: duckdb_connection</span> <span style='font-weight: bold;'>film</span> <span style='font-weight: bold;'>award_type</span> <span style='font-weight: bold;'>status</span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #BCBCBC;'>1</span> Toy Story Animated Feature Award not yet introduced <span style='color: #BCBCBC;'>2</span> Toy Story Original Screenplay Nominated <span style='color: #BCBCBC;'>3</span> Toy Story Adapted Screenplay Ineligible </CODE></PRE> ] .pull-right[ ## Use ```r academy_small %>% left_join(pixar_films, by = "film") ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># Source: lazy query [?? x 7]</span> <span style='color: #949494;'># Database: duckdb_connection</span> <span style='font-weight: bold;'>film</span> <span style='font-weight: bold;'>award_type</span> <span style='font-weight: bold;'>status</span> <span style='font-weight: bold;'>number</span> <span style='font-weight: bold;'>release_d…</span> <span style='font-weight: bold;'>run_t…</span> <span style='font-weight: bold;'>film_…</span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><date></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #BCBCBC;'>1</span> Toy Story Adapted Screenpl… Ineli… 1 1995-11-22 81 G <span style='color: #BCBCBC;'>2</span> Toy Story Original Screenp… Nomin… 1 1995-11-22 81 G <span style='color: #BCBCBC;'>3</span> Toy Story Animated Feature Award… 1 1995-11-22 81 G </CODE></PRE> ] --- # DuckDB: Performance comparison Baseline: Data frames. .pull-left[ ```r nrow(nycflights13::flights) ``` ``` [1] 336776 ``` ] .pull-right[ ```r system.time( nycflights13::flights %>% count(year, month, day) ) ``` ``` user system elapsed 0.010 0.000 0.011 ``` ] --- # DuckDB: Performance comparison With registration. .pull-left[ ```r system.time(duckdb::duckdb_register( con_duckdb, "flights", nycflights13::flights )) ``` ``` user system elapsed 0.003 0.000 0.002 ``` ```r flights_register <- tbl(con_duckdb, "flights") flights_register %>% count() ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># Source: lazy query [?? x 1]</span> <span style='color: #949494;'># Database: duckdb_connection</span> <span style='font-weight: bold;'>n</span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #BCBCBC;'>1</span> <span style='text-decoration: underline;'>336</span>776 </CODE></PRE> ] .pull-right[ ```r system.time( flights_register %>% count(year, month, day) %>% collect() ) ``` ``` user system elapsed 0.030 0.001 0.031 ``` ] --- # DuckDB: Performance comparison With copy. .pull-left[ ```r system.time( flights_copy <- copy_to(con_duckdb, nycflights13::flights) ) ``` ``` user system elapsed 0.087 0.010 0.099 ``` ```r flights_copy %>% count() ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># Source: lazy query [?? x 1]</span> <span style='color: #949494;'># Database: duckdb_connection</span> <span style='font-weight: bold;'>n</span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #BCBCBC;'>1</span> <span style='text-decoration: underline;'>336</span>776 </CODE></PRE> ] .pull-right[ ```r system.time( flights_copy %>% count(year, month, day) %>% collect() ) ``` ``` user system elapsed 0.008 0.000 0.007 ``` ] --- # ETL, revisited Insert a second table into our database. ```r db_path <- fs::path_abs("pixar.duckdb") con <- DBI::dbConnect(duckdb::duckdb(dbdir = db_path)) DBI::dbWriteTable(con, "academy", pixarfilms::academy, overwrite = TRUE) DBI::dbExecute(con, "CREATE UNIQUE INDEX academy_pk ON academy (film, award_type)") ``` ``` [1] 0 ``` ```r DBI::dbExecute(con, "CREATE INDEX academy_fk ON academy (film)") ``` ``` [1] 0 ``` ```r DBI::dbDisconnect(con) ``` --- background-image: url("data:image/png;base64,#images/21-frame.webp") background-size: 40% background-position: 100% 100% # Joins: Exercises 1 .pull-left[ 1. How many rows does the join between `academy` and `pixar_films` contain? Try to find out without loading all the data into memory. Explain. 2. Which films are not yet listed in the `academy` table? What does the resulting SQL query look like? - Hint: Use `anti_join()` ] --- background-image: url("data:image/png;base64,#images/21-frame.webp") background-size: 40% background-position: 100% 100% # Joins: Exercises 2 .pull-left[ 3. Transform `academy` into a wide table so that there is at most one row per film. Join the resulting table with the `pixar_films` table. - Hint: Use `pivot_wider()`, `spread()`, `dcast()`, ... . You need to compute locally, because these functions don't work on the database. ] .pull-right[ 4. Plot a bar chart with the number of awards won and nominated per year. Compute as much as possible on the database. - Hint: "Long form" or "wide form"? ] --- # Schedule Extract ⇨ Transform ⇨ Load ⇨ Consume .pull-left[ ## One table - Read whole tables - Let the database do the heavy lifting - Basic ETL for one table - Subtle issues to watch out for ] .pull-right[ ## Multiple tables - Joins - **The {dm} package** - A bit of theory - Playing the whole game ] ## Bonus: Bring your own data + questions --- background-image: url("data:image/png;base64,#images/22.webp") background-size: 40% background-position: 100% 100% # Data model basics .pull-left[ - Compound object for multiple tables - Features Script: `databases_22.R` ```r library(tidyverse) library(dm) ``` ] --- background-image: url("data:image/png;base64,#images/22-frame.webp") background-size: 40% background-position: 100% 100% # Data model basics .pull-left[ - Compound object for multiple tables - Features Script: `databases_22.R` ```r library(tidyverse) library(dm) ``` ] --- # Data model objects .pull-left[ Store multiple tables in an object. ```r pixar_dm <- dm_pixarfilms() pixar_dm ``` <PRE class="fansi fansi-output"><CODE><span style='color: #FFAFFF;'>──</span> <span style='color: #FFAFFF;'>Metadata</span> <span style='color: #FFAFFF;'>────────────────────────────────────────────────────────</span> Tables: `pixar_films`, `pixar_people`, `academy`, `box_office`, `genres`, `public_response` Columns: 23 Primary keys: 5 Foreign keys: 5 </CODE></PRE> ```r pixar_dm %>% dm_draw() ``` ] .pull-right[ <!-- Generated by graphviz version 2.40.1 (20161225.0304) --> <!-- Title: %0 Pages: 1 --> <svg width="315pt" height="330pt" viewBox="-100.00 0.00 315.00 330.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"> <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 326)"> <title>%0</title> <g id="a_graph0"><a xlink:title="Data Model"> <polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-326 211,-326 211,4 -4,4"/> </a> </g> <!-- academy --> <g id="node1" class="node"> <title>academy</title> <polygon fill="#ed7d31" stroke="transparent" points="1.5,-301 1.5,-321 100.5,-321 100.5,-301 1.5,-301"/> <text text-anchor="start" x="26.1255" y="-306.4" font-family="Times,serif" font-size="14.00" fill="#ffffff">academy</text> <polygon fill="#fbe5d5" stroke="transparent" points="1.5,-281 1.5,-301 100.5,-301 100.5,-281 1.5,-281"/> <text text-anchor="start" x="3.5" y="-286.4" font-family="Times,serif" font-size="14.00" fill="#444444">film</text> <polygon fill="#fbe5d5" stroke="transparent" points="1.5,-261 1.5,-281 100.5,-281 100.5,-261 1.5,-261"/> <text text-anchor="start" x="3.183" y="-267.4" font-family="Times,serif" text-decoration="underline" font-size="14.00" fill="#444444">film, award_type</text> <polygon fill="none" stroke="#9e5320" stroke-opacity="0.666667" points="0,-260 0,-322 101,-322 101,-260 0,-260"/> </g> <!-- pixar_films --> <g id="node4" class="node"> <title>pixar_films</title> <polygon fill="#5b9bd5" stroke="transparent" points="138,-161 138,-181 206,-181 206,-161 138,-161"/> <text text-anchor="start" x="139.7286" y="-166.4" font-family="Times,serif" font-size="14.00" fill="#ffffff">pixar_films</text> <polygon fill="#deebf6" stroke="transparent" points="138,-141 138,-161 206,-161 206,-141 138,-141"/> <text text-anchor="start" x="140" y="-147.4" font-family="Times,serif" text-decoration="underline" font-size="14.00" fill="#444444">film</text> <polygon fill="none" stroke="#3c678e" stroke-opacity="0.666667" points="137,-140 137,-182 207,-182 207,-140 137,-140"/> </g> <!-- academy->pixar_films --> <g id="edge2" class="edge"> <title>academy:film->pixar_films:film</title> <path fill="none" stroke="#555555" d="M100.5,-291C161.0155,-291 84.073,-167.4399 127.9432,-152.4775"/> <polygon fill="#555555" stroke="#555555" points="128.615,-155.9165 138,-151 127.5974,-148.9909 128.615,-155.9165"/> </g> <!-- box_office --> <g id="node2" class="node"> <title>box_office</title> <polygon fill="#ed7d31" stroke="transparent" points="18.5,-221 18.5,-241 82.5,-241 82.5,-221 18.5,-221"/> <text text-anchor="start" x="20.1795" y="-226.4" font-family="Times,serif" font-size="14.00" fill="#ffffff">box_office</text> <polygon fill="#fbe5d5" stroke="transparent" points="18.5,-201 18.5,-221 82.5,-221 82.5,-201 18.5,-201"/> <text text-anchor="start" x="20.5" y="-207.4" font-family="Times,serif" text-decoration="underline" font-size="14.00" fill="#444444">film</text> <polygon fill="none" stroke="#9e5320" stroke-opacity="0.666667" points="17.5,-200 17.5,-242 83.5,-242 83.5,-200 17.5,-200"/> </g> <!-- box_office->pixar_films --> <g id="edge3" class="edge"> <title>box_office:film->pixar_films:film</title> <path fill="none" stroke="#555555" d="M82.5,-211C114.9945,-211 104.6973,-162.9888 128.1654,-152.8615"/> <polygon fill="#555555" stroke="#555555" points="128.8254,-156.2988 138,-151 127.5235,-149.4209 128.8254,-156.2988"/> </g> <!-- genres --> <g id="node3" class="node"> <title>genres</title> <polygon fill="#ed7d31" stroke="transparent" points="18.5,-161 18.5,-181 83.5,-181 83.5,-161 18.5,-161"/> <text text-anchor="start" x="32.7328" y="-166.4" font-family="Times,serif" font-size="14.00" fill="#ffffff">genres</text> <polygon fill="#fbe5d5" stroke="transparent" points="18.5,-141 18.5,-161 83.5,-161 83.5,-141 18.5,-141"/> <text text-anchor="start" x="20.5" y="-146.4" font-family="Times,serif" font-size="14.00" fill="#444444">film</text> <polygon fill="#fbe5d5" stroke="transparent" points="18.5,-121 18.5,-141 83.5,-141 83.5,-121 18.5,-121"/> <text text-anchor="start" x="20.2889" y="-127.4" font-family="Times,serif" text-decoration="underline" font-size="14.00" fill="#444444">film, genre</text> <polygon fill="none" stroke="#9e5320" stroke-opacity="0.666667" points="17,-120 17,-182 84,-182 84,-120 17,-120"/> </g> <!-- genres->pixar_films --> <g id="edge4" class="edge"> <title>genres:film->pixar_films:film</title> <path fill="none" stroke="#555555" d="M83.5,-151C103.9375,-151 111.4419,-151 127.8378,-151"/> <polygon fill="#555555" stroke="#555555" points="128,-154.5001 138,-151 128,-147.5001 128,-154.5001"/> </g> <!-- pixar_people --> <g id="node5" class="node"> <title>pixar_people</title> <polygon fill="#70ad47" stroke="transparent" points="12.5,-81 12.5,-101 89.5,-101 89.5,-81 12.5,-81"/> <text text-anchor="start" x="14.4572" y="-86.4" font-family="Times,serif" font-size="14.00" fill="#ffffff">pixar_people</text> <polygon fill="#e2eeda" stroke="transparent" points="12.5,-61 12.5,-81 89.5,-81 89.5,-61 12.5,-61"/> <text text-anchor="start" x="14.5" y="-66.4" font-family="Times,serif" font-size="14.00" fill="#444444">film</text> <polygon fill="none" stroke="#4a732f" stroke-opacity="0.666667" points="11,-60 11,-102 90,-102 90,-60 11,-60"/> </g> <!-- pixar_people->pixar_films --> <g id="edge1" class="edge"> <title>pixar_people:film->pixar_films:film</title> <path fill="none" stroke="#555555" d="M89.5,-71C127.1813,-71 102.2491,-136.7031 128.1904,-149.0225"/> <polygon fill="#555555" stroke="#555555" points="127.5055,-152.4547 138,-151 128.8889,-145.5928 127.5055,-152.4547"/> </g> <!-- public_response --> <g id="node6" class="node"> <title>public_response</title> <polygon fill="#ed7d31" stroke="transparent" points="3.5,-21 3.5,-41 97.5,-41 97.5,-21 3.5,-21"/> <text text-anchor="start" x="5.0126" y="-26.4" font-family="Times,serif" font-size="14.00" fill="#ffffff">public_response</text> <polygon fill="#fbe5d5" stroke="transparent" points="3.5,-1 3.5,-21 97.5,-21 97.5,-1 3.5,-1"/> <text text-anchor="start" x="5.5" y="-7.4" font-family="Times,serif" text-decoration="underline" font-size="14.00" fill="#444444">film</text> <polygon fill="none" stroke="#9e5320" stroke-opacity="0.666667" points="2.5,0 2.5,-42 98.5,-42 98.5,0 2.5,0"/> </g> <!-- public_response->pixar_films --> <g id="edge5" class="edge"> <title>public_response:film->pixar_films:film</title> <path fill="none" stroke="#555555" d="M97.5,-11C158.3517,-11 83.4456,-134.5601 127.8579,-149.5225"/> <polygon fill="#555555" stroke="#555555" points="127.5999,-153.0217 138,-151 128.6091,-146.0949 127.5999,-153.0217"/> </g> </g> </svg> ] --- # Data model objects Use like a named list. ```r names(pixar_dm) ``` ``` [1] "pixar_films" "pixar_people" "academy" "box_office" "genres" "public_response" ``` .pull-left[ ```r pixar_dm$pixar_films ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># A tibble: 27 × 5</span> <span style='font-weight: bold;'>number</span> <span style='font-weight: bold;'>film</span> <span style='font-weight: bold;'>release_date</span> <span style='font-weight: bold;'>run_time</span> <span style='font-weight: bold;'>film_rating</span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><date></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #BCBCBC;'> 1</span> 1 Toy Story 1995-11-22 81 G <span style='color: #BCBCBC;'> 2</span> 2 A Bug's Life 1998-11-25 95 G <span style='color: #BCBCBC;'> 3</span> 3 Toy Story 2 1999-11-24 92 G <span style='color: #BCBCBC;'> 4</span> 4 Monsters, Inc. 2001-11-02 92 G <span style='color: #BCBCBC;'> 5</span> 5 Finding Nemo 2003-05-30 100 G <span style='color: #BCBCBC;'> 6</span> 6 The Incredibles 2004-11-05 115 PG <span style='color: #BCBCBC;'> 7</span> 7 Cars 2006-06-09 117 G <span style='color: #BCBCBC;'> 8</span> 8 Ratatouille 2007-06-29 111 G <span style='color: #BCBCBC;'> 9</span> 9 WALL-E 2008-06-27 98 G <span style='color: #BCBCBC;'>10</span> 10 Up 2009-05-29 96 PG <span style='color: #949494;'># … with 17 more rows</span> </CODE></PRE> ] .pull-right[ ```r pixar_dm$academy ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># A tibble: 80 × 3</span> <span style='font-weight: bold;'>film</span> <span style='font-weight: bold;'>award_type</span> <span style='font-weight: bold;'>status</span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #BCBCBC;'> 1</span> Toy Story Animated Feature Award not yet introduced <span style='color: #BCBCBC;'> 2</span> Toy Story Original Screenplay Nominated <span style='color: #BCBCBC;'> 3</span> Toy Story Adapted Screenplay Ineligible <span style='color: #BCBCBC;'> 4</span> Toy Story Original Score Nominated <span style='color: #BCBCBC;'> 5</span> Toy Story Original Song Nominated <span style='color: #BCBCBC;'> 6</span> Toy Story Other Won Special Achievement <span style='color: #BCBCBC;'> 7</span> A Bug's Life Animated Feature Award not yet introduced <span style='color: #BCBCBC;'> 8</span> A Bug's Life Adapted Screenplay Ineligible <span style='color: #BCBCBC;'> 9</span> A Bug's Life Original Score Nominated <span style='color: #BCBCBC;'>10</span> Toy Story 2 Animated Feature Award not yet introduced <span style='color: #949494;'># … with 70 more rows</span> </CODE></PRE> ] --- # Showcase: wrapping all tables in a data model One of the many operations supported by {dm}. .pull-left[ ```r pixar_films_wrapped <- pixar_dm %>% dm_wrap_tbl(pixar_films) %>% pull_tbl(pixar_films) pixar_films_wrapped ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># A tibble: 27 × 10</span> <span style='font-weight: bold;'>number</span> <span style='font-weight: bold;'>film</span> <span style='font-weight: bold;'>release_d…</span> <span style='font-weight: bold;'>run_t…</span> <span style='font-weight: bold;'>film_…</span> <span style='font-weight: bold;'>pixar_p…</span> <span style='font-weight: bold;'>academy</span> <span style='font-weight: bold;'>box_off…</span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><date></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><nested></span> <span style='color: #949494; font-style: italic;'><nested></span> <span style='color: #949494; font-style: italic;'><nested></span> <span style='color: #BCBCBC;'> 1</span> 1 Toy S… 1995-11-22 81 G <tibble> <tibble> <tibble> <span style='color: #BCBCBC;'> 2</span> 2 A Bug… 1998-11-25 95 G <tibble> <tibble> <tibble> <span style='color: #BCBCBC;'> 3</span> 3 Toy S… 1999-11-24 92 G <tibble> <tibble> <tibble> <span style='color: #BCBCBC;'> 4</span> 4 Monst… 2001-11-02 92 G <tibble> <tibble> <tibble> <span style='color: #BCBCBC;'> 5</span> 5 Findi… 2003-05-30 100 G <tibble> <tibble> <tibble> <span style='color: #BCBCBC;'> 6</span> 6 The I… 2004-11-05 115 PG <tibble> <tibble> <tibble> <span style='color: #BCBCBC;'> 7</span> 7 Cars 2006-06-09 117 G <tibble> <tibble> <tibble> <span style='color: #BCBCBC;'> 8</span> 8 Ratat… 2007-06-29 111 G <tibble> <tibble> <tibble> <span style='color: #BCBCBC;'> 9</span> 9 WALL-E 2008-06-27 98 G <tibble> <tibble> <tibble> <span style='color: #BCBCBC;'>10</span> 10 Up 2009-05-29 96 PG <tibble> <tibble> <tibble> <span style='color: #949494;'># … with 17 more rows, and 2 more variables: </span><span style='color: #949494; font-weight: bold;'>genres</span><span style='color: #949494;'> <nested>,</span> <span style='color: #949494;'># </span><span style='color: #949494; font-weight: bold;'>public_response</span><span style='color: #949494;'> <nested></span> </CODE></PRE> ] .pull-right[ ```r pixar_films_wrapped$academy[1:2] ``` <PRE class="fansi fansi-output"><CODE>[[1]] <span style='color: #949494;'># A tibble: 6 × 2</span> <span style='font-weight: bold;'>award_type</span> <span style='font-weight: bold;'>status</span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #BCBCBC;'>1</span> Animated Feature Award not yet introduced <span style='color: #BCBCBC;'>2</span> Original Screenplay Nominated <span style='color: #BCBCBC;'>3</span> Adapted Screenplay Ineligible <span style='color: #BCBCBC;'>4</span> Original Score Nominated <span style='color: #BCBCBC;'>5</span> Original Song Nominated <span style='color: #BCBCBC;'>6</span> Other Won Special Achievement [[2]] <span style='color: #949494;'># A tibble: 3 × 2</span> <span style='font-weight: bold;'>award_type</span> <span style='font-weight: bold;'>status</span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #BCBCBC;'>1</span> Animated Feature Award not yet introduced <span style='color: #BCBCBC;'>2</span> Adapted Screenplay Ineligible <span style='color: #BCBCBC;'>3</span> Original Score Nominated </CODE></PRE> ] --- background-image: url("data:image/png;base64,#images/22-frame.webp") background-size: 40% background-position: 100% 100% # Data model basics: Exercises .pull-left[ - Experiment! ] --- # Schedule Extract ⇨ Transform ⇨ Load ⇨ Consume .pull-left[ ## One table - Read whole tables - Let the database do the heavy lifting - Basic ETL for one table - Subtle issues to watch out for ] .pull-right[ ## Multiple tables - Joins - The {dm} package - **A bit of theory** - Playing the whole game ] ## Bonus: Bring your own data + questions --- background-image: url("data:image/png;base64,#images/23.webp") background-size: 40% background-position: 100% 100% # Data models .pull-left[ - Keys, relationships, constraints - Zooming Script: `databases_23.R` ```r library(tidyverse) library(dm) ``` ] --- background-image: url("data:image/png;base64,#images/23-frame.webp") background-size: 40% background-position: 100% 100% # Data models .pull-left[ - Keys, relationships, constraints - Zooming Script: `databases_23.R` ```r library(tidyverse) library(dm) ``` ] --- # Primary keys Column(s) that uniquely identify rows in a table. .pull-left[ ```r any(duplicated(pixar_dm$pixar_films$film)) ``` ``` [1] FALSE ``` ```r check_key(pixar_dm$pixar_films, film) ``` ] .pull-right[ ```r any(duplicated(pixar_dm$academy[c("film", "award_type")])) ``` ``` [1] FALSE ``` ```r check_key(pixar_dm$academy, film, award_type) try( check_key(pixar_dm$academy, film) ) ``` ``` Error in abort_not_unique_key(as_label(data_q), orig_names) : (`film`) not a unique key of `pixar_dm$academy`. ``` ] --- # Foreign keys Column(s) that point to a primary key in another table. ```r all(pixar_dm$academy$film %in% pixar_dm$pixar_films$film) ``` ``` [1] TRUE ``` ```r check_subset(pixar_dm$academy, film, pixar_dm$pixar_films, film) try( check_subset(pixar_dm$pixar_films, film, pixar_dm$academy, film) ) ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># A tibble: 4 × 5</span> <span style='font-weight: bold;'>number</span> <span style='font-weight: bold;'>film</span> <span style='font-weight: bold;'>release_date</span> <span style='font-weight: bold;'>run_time</span> <span style='font-weight: bold;'>film_rating</span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><date></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #BCBCBC;'>1</span> 24 Luca 2021-06-18 151 N/A <span style='color: #BCBCBC;'>2</span> 25 Turning Red 2022-03-11 <span style='color: #BB0000;'>NA</span> N/A <span style='color: #BCBCBC;'>3</span> 26 Lightyear 2022-06-17 <span style='color: #BB0000;'>NA</span> N/A <span style='color: #BCBCBC;'>4</span> 27 <span style='color: #BB0000;'>NA</span> 2023-06-16 155 Not Rated Error in abort_not_subset_of(as_label(t1q), col_names_1, as_label(t2q), : Column (`film`) of table `pixar_dm$pixar_films` contains values (see examples above) that are not present in column (`film`) of table `pixar_dm$academy`. </CODE></PRE> --- # Constraints Properties of primary and foreign keys can be checked. ```r pixar_dm %>% dm_examine_constraints() ``` <PRE class="fansi fansi-message"><CODE><span style='color: #BBBB00;'>!</span> Unsatisfied constraints: </CODE></PRE><PRE class="fansi fansi-output"><CODE><span style='color: #BB0000;'>•</span> Table `pixar_films`: primary key `film`: has 1 missing values </CODE></PRE> ```r dm_pixarfilms(consistent = TRUE) %>% dm_examine_constraints() ``` <PRE class="fansi fansi-message"><CODE><span style='color: #00BBBB;'>ℹ</span> All constraints satisfied. </CODE></PRE> --- # Constraints An example from another dataset. ```r dm_nycflights13() %>% dm_examine_constraints() ``` <PRE class="fansi fansi-message"><CODE><span style='color: #BBBB00;'>!</span> Unsatisfied constraints: </CODE></PRE><PRE class="fansi fansi-output"><CODE><span style='color: #BB0000;'>•</span> Table `flights`: foreign key `tailnum` into table `planes`: values of `flights$tailnum` not in `planes$tailnum`: N725MQ (6), N537MQ (5), N722MQ (5), N730MQ (5), N736MQ (5), … </CODE></PRE> --- # Zooming Focusing on one table in a dm object. Allows applying data transformations on that table inside a dm object. .pull-left[ ```r pixar_dm %>% dm_zoom_to(academy) ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># Zoomed table: academy</span> <span style='color: #949494;'># A tibble: 80 × 3</span> <span style='font-weight: bold;'>film</span> <span style='font-weight: bold;'>award_type</span> <span style='font-weight: bold;'>status</span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #BCBCBC;'> 1</span> Toy Story Animated Feature Award not yet introduced <span style='color: #BCBCBC;'> 2</span> Toy Story Original Screenplay Nominated <span style='color: #BCBCBC;'> 3</span> Toy Story Adapted Screenplay Ineligible <span style='color: #BCBCBC;'> 4</span> Toy Story Original Score Nominated <span style='color: #BCBCBC;'> 5</span> Toy Story Original Song Nominated <span style='color: #BCBCBC;'> 6</span> Toy Story Other Won Special Achievement <span style='color: #BCBCBC;'> 7</span> A Bug's Life Animated Feature Award not yet introduced <span style='color: #BCBCBC;'> 8</span> A Bug's Life Adapted Screenplay Ineligible <span style='color: #BCBCBC;'> 9</span> A Bug's Life Original Score Nominated <span style='color: #BCBCBC;'>10</span> Toy Story 2 Animated Feature Award not yet introduced <span style='color: #949494;'># … with 70 more rows</span> </CODE></PRE> ] .pull-right[ ```r pixar_dm %>% dm_zoom_to(academy) %>% left_join(pixar_films, select = c(film, release_date)) ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># Zoomed table: academy</span> <span style='color: #949494;'># A tibble: 80 × 4</span> <span style='font-weight: bold;'>film</span> <span style='font-weight: bold;'>award_type</span> <span style='font-weight: bold;'>status</span> <span style='font-weight: bold;'>release_d…</span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><date></span> <span style='color: #BCBCBC;'> 1</span> Toy Story Animated Feature Award not yet introd… 1995-11-22 <span style='color: #BCBCBC;'> 2</span> Toy Story Original Screenplay Nominated 1995-11-22 <span style='color: #BCBCBC;'> 3</span> Toy Story Adapted Screenplay Ineligible 1995-11-22 <span style='color: #BCBCBC;'> 4</span> Toy Story Original Score Nominated 1995-11-22 <span style='color: #BCBCBC;'> 5</span> Toy Story Original Song Nominated 1995-11-22 <span style='color: #BCBCBC;'> 6</span> Toy Story Other Won Special Achievem… 1995-11-22 <span style='color: #BCBCBC;'> 7</span> A Bug's Life Animated Feature Award not yet introd… 1998-11-25 <span style='color: #BCBCBC;'> 8</span> A Bug's Life Adapted Screenplay Ineligible 1998-11-25 <span style='color: #BCBCBC;'> 9</span> A Bug's Life Original Score Nominated 1998-11-25 <span style='color: #BCBCBC;'>10</span> Toy Story 2 Animated Feature Award not yet introd… 1999-11-24 <span style='color: #949494;'># … with 70 more rows</span> </CODE></PRE> ] --- # Flattening Join a table to all related tables. ```r pixar_dm %>% dm_flatten_to_tbl(academy) ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># A tibble: 80 × 7</span> <span style='font-weight: bold;'>film</span> <span style='font-weight: bold;'>award_type</span> <span style='font-weight: bold;'>status</span> <span style='font-weight: bold;'>number</span> <span style='font-weight: bold;'>release_date</span> <span style='font-weight: bold;'>run_time</span> <span style='font-weight: bold;'>film_rating</span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><date></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #BCBCBC;'> 1</span> Toy Story Animated Feature Award not yet introduced 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 2</span> Toy Story Original Screenplay Nominated 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 3</span> Toy Story Adapted Screenplay Ineligible 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 4</span> Toy Story Original Score Nominated 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 5</span> Toy Story Original Song Nominated 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 6</span> Toy Story Other Won Special Achievement 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 7</span> A Bug's Life Animated Feature Award not yet introduced 2 1998-11-25 95 G <span style='color: #BCBCBC;'> 8</span> A Bug's Life Adapted Screenplay Ineligible 2 1998-11-25 95 G <span style='color: #BCBCBC;'> 9</span> A Bug's Life Original Score Nominated 2 1998-11-25 95 G <span style='color: #BCBCBC;'>10</span> Toy Story 2 Animated Feature Award not yet introduced 3 1999-11-24 92 G <span style='color: #949494;'># … with 70 more rows</span> </CODE></PRE> --- # Flattening A larger example with a different dataset. ```r dm_nycflights13() %>% dm_select(weather, -year, -month, -day, -hour) %>% dm_flatten_to_tbl(flights) ``` ``` Renaming ambiguous columns: %>% dm_rename(flights, flights.year = year) %>% dm_rename(airlines, airlines.name = name) %>% dm_rename(airports, airports.name = name) %>% dm_rename(planes, planes.year = year) ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># A tibble: 1,761 × 44</span> <span style='font-weight: bold;'>flights.year</span> <span style='font-weight: bold;'>month</span> <span style='font-weight: bold;'>day</span> <span style='font-weight: bold;'>dep_time</span> <span style='font-weight: bold;'>sched_de…</span> <span style='font-weight: bold;'>dep_d…</span> <span style='font-weight: bold;'>arr_t…</span> <span style='font-weight: bold;'>sched…</span> <span style='font-weight: bold;'>arr_d…</span> <span style='font-weight: bold;'>carri…</span> <span style='font-weight: bold;'>flight</span> <span style='font-weight: bold;'>tailn…</span> <span style='font-weight: bold;'>origin</span> <span style='font-weight: bold;'>dest</span> <span style='font-weight: bold;'>air_t…</span> <span style='font-weight: bold;'>dista…</span> <span style='font-weight: bold;'>hour</span> <span style='font-weight: bold;'>minute</span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #BCBCBC;'> 1</span> <span style='text-decoration: underline;'>2</span>013 1 10 3 <span style='text-decoration: underline;'>2</span>359 4 426 437 -<span style='color: #BB0000;'>11</span> B6 727 N571JB JFK BQN 183 <span style='text-decoration: underline;'>1</span>576 23 59 <span style='color: #BCBCBC;'> 2</span> <span style='text-decoration: underline;'>2</span>013 1 10 16 <span style='text-decoration: underline;'>2</span>359 17 447 444 3 B6 739 N564JB JFK PSE 191 <span style='text-decoration: underline;'>1</span>617 23 59 <span style='color: #BCBCBC;'> 3</span> <span style='text-decoration: underline;'>2</span>013 1 10 450 500 -<span style='color: #BB0000;'>10</span> 634 648 -<span style='color: #BB0000;'>14</span> US <span style='text-decoration: underline;'>1</span>117 N171US EWR CLT 78 529 5 0 <span style='color: #BCBCBC;'> 4</span> <span style='text-decoration: underline;'>2</span>013 1 10 520 525 -<span style='color: #BB0000;'>5</span> 813 820 -<span style='color: #BB0000;'>7</span> UA <span style='text-decoration: underline;'>1</span>018 N35204 EWR IAH 215 <span style='text-decoration: underline;'>1</span>400 5 25 <span style='color: #BCBCBC;'> 5</span> <span style='text-decoration: underline;'>2</span>013 1 10 530 530 0 824 829 -<span style='color: #BB0000;'>5</span> UA 404 N815UA LGA IAH 210 <span style='text-decoration: underline;'>1</span>416 5 30 <span style='color: #BCBCBC;'> 6</span> <span style='text-decoration: underline;'>2</span>013 1 10 531 540 -<span style='color: #BB0000;'>9</span> 832 850 -<span style='color: #BB0000;'>18</span> AA <span style='text-decoration: underline;'>1</span>141 N5EAAA JFK MIA 149 <span style='text-decoration: underline;'>1</span>089 5 40 <span style='color: #BCBCBC;'> 7</span> <span style='text-decoration: underline;'>2</span>013 1 10 535 540 -<span style='color: #BB0000;'>5</span> <span style='text-decoration: underline;'>1</span>015 <span style='text-decoration: underline;'>1</span>017 -<span style='color: #BB0000;'>2</span> B6 725 N784JB JFK BQN 191 <span style='text-decoration: underline;'>1</span>576 5 40 <span style='color: #BCBCBC;'> 8</span> <span style='text-decoration: underline;'>2</span>013 1 10 546 600 -<span style='color: #BB0000;'>14</span> 645 709 -<span style='color: #BB0000;'>24</span> B6 380 N337JB EWR BOS 39 200 6 0 <span style='color: #BCBCBC;'> 9</span> <span style='text-decoration: underline;'>2</span>013 1 10 549 600 -<span style='color: #BB0000;'>11</span> 652 724 -<span style='color: #BB0000;'>32</span> EV <span style='text-decoration: underline;'>6</span>055 N19554 LGA IAD 48 229 6 0 <span style='color: #BCBCBC;'>10</span> <span style='text-decoration: underline;'>2</span>013 1 10 550 600 -<span style='color: #BB0000;'>10</span> 649 703 -<span style='color: #BB0000;'>14</span> US <span style='text-decoration: underline;'>2</span>114 N740UW LGA BOS 36 184 6 0 <span style='color: #949494;'># … with 1,751 more rows, and 26 more variables: </span><span style='color: #949494; font-weight: bold;'>time_hour</span><span style='color: #949494;'> <dttm>, </span><span style='color: #949494; font-weight: bold;'>airlines.name</span><span style='color: #949494;'> <chr>, </span><span style='color: #949494; font-weight: bold;'>airports.name</span><span style='color: #949494;'> <chr>, </span><span style='color: #949494; font-weight: bold;'>lat</span><span style='color: #949494;'> <dbl>, </span><span style='color: #949494; font-weight: bold;'>lon</span><span style='color: #949494;'> <dbl>,</span> <span style='color: #949494;'># </span><span style='color: #949494; font-weight: bold;'>alt</span><span style='color: #949494;'> <dbl>, </span><span style='color: #949494; font-weight: bold;'>tz</span><span style='color: #949494;'> <dbl>, </span><span style='color: #949494; font-weight: bold;'>dst</span><span style='color: #949494;'> <chr>, </span><span style='color: #949494; font-weight: bold;'>tzone</span><span style='color: #949494;'> <chr>, </span><span style='color: #949494; font-weight: bold;'>planes.year</span><span style='color: #949494;'> <int>, </span><span style='color: #949494; font-weight: bold;'>type</span><span style='color: #949494;'> <chr>, </span><span style='color: #949494; font-weight: bold;'>manufacturer</span><span style='color: #949494;'> <chr>, </span><span style='color: #949494; font-weight: bold;'>model</span><span style='color: #949494;'> <chr>, </span><span style='color: #949494; font-weight: bold;'>engines</span><span style='color: #949494;'> <int>,</span> <span style='color: #949494;'># </span><span style='color: #949494; font-weight: bold;'>seats</span><span style='color: #949494;'> <int>, </span><span style='color: #949494; font-weight: bold;'>speed</span><span style='color: #949494;'> <int>, </span><span style='color: #949494; font-weight: bold;'>engine</span><span style='color: #949494;'> <chr>, </span><span style='color: #949494; font-weight: bold;'>temp</span><span style='color: #949494;'> <dbl>, </span><span style='color: #949494; font-weight: bold;'>dewp</span><span style='color: #949494;'> <dbl>, </span><span style='color: #949494; font-weight: bold;'>humid</span><span style='color: #949494;'> <dbl>, </span><span style='color: #949494; font-weight: bold;'>wind_dir</span><span style='color: #949494;'> <dbl>, </span><span style='color: #949494; font-weight: bold;'>wind_speed</span><span style='color: #949494;'> <dbl>, </span><span style='color: #949494; font-weight: bold;'>wind_gust</span><span style='color: #949494;'> <dbl>,</span> <span style='color: #949494;'># </span><span style='color: #949494; font-weight: bold;'>precip</span><span style='color: #949494;'> <dbl>, </span><span style='color: #949494; font-weight: bold;'>pressure</span><span style='color: #949494;'> <dbl>, </span><span style='color: #949494; font-weight: bold;'>visib</span><span style='color: #949494;'> <dbl></span> </CODE></PRE> --- background-image: url("data:image/png;base64,#images/23-frame.webp") background-size: 40% background-position: 100% 100% # Data models: Exercises .pull-left[ - Experiment! ] --- # Schedule Extract ⇨ Transform ⇨ Load ⇨ Consume .pull-left[ ## One table - Read whole tables - Let the database do the heavy lifting - Basic ETL for one table - Subtle issues to watch out for ] .pull-right[ ## Multiple tables - Joins - The {dm} package - A bit of theory - **Playing the whole game** ] ## Bonus: Bring your own data + questions --- background-image: url("data:image/png;base64,#images/24.webp") background-size: 40% background-position: 100% 100% # The whole game .pull-left[ - Build a local data model - Copy it to the database - Consume it Script: `databases_24.R` ```r library(tidyverse) library(dm) ``` ] --- background-image: url("data:image/png;base64,#images/24-frame.webp") background-size: 40% background-position: 100% 100% # The whole game .pull-left[ - Build a local data model - Copy it to the database - Consume it Script: `databases_24.R` ```r library(tidyverse) library(dm) ``` ] --- background-image: url("data:image/png;base64,#images/magic-hat-and-wand-nath-r.svg") background-size: 25% background-position: 100% 100% # Define transformed table - Fix type of `number` column - Extract `franchise` and `sequel` columns ```r pixar_films_clean ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># A tibble: 26 × 7</span> <span style='font-weight: bold;'>number</span> <span style='font-weight: bold;'>film</span> <span style='font-weight: bold;'>franchise</span> <span style='font-weight: bold;'>sequel</span> <span style='font-weight: bold;'>release_date</span> <span style='font-weight: bold;'>run_time</span> <span style='font-weight: bold;'>film_rating</span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><date></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #BCBCBC;'> 1</span> 1 Toy Story Toy Story 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 2</span> 2 A Bug's Life A Bug's Life <span style='color: #BB0000;'>NA</span> 1998-11-25 95 G <span style='color: #BCBCBC;'> 3</span> 3 Toy Story 2 Toy Story 2 1999-11-24 92 G <span style='color: #BCBCBC;'> 4</span> 4 Monsters, Inc. Monsters, Inc. <span style='color: #BB0000;'>NA</span> 2001-11-02 92 G <span style='color: #BCBCBC;'> 5</span> 5 Finding Nemo Finding Nemo <span style='color: #BB0000;'>NA</span> 2003-05-30 100 G <span style='color: #BCBCBC;'> 6</span> 6 The Incredibles The Incredibles <span style='color: #BB0000;'>NA</span> 2004-11-05 115 PG <span style='color: #BCBCBC;'> 7</span> 7 Cars Cars 1 2006-06-09 117 G <span style='color: #BCBCBC;'> 8</span> 8 Ratatouille Ratatouille <span style='color: #BB0000;'>NA</span> 2007-06-29 111 G <span style='color: #BCBCBC;'> 9</span> 9 WALL-E WALL-E <span style='color: #BB0000;'>NA</span> 2008-06-27 98 G <span style='color: #BCBCBC;'>10</span> 10 Up Up <span style='color: #BB0000;'>NA</span> 2009-05-29 96 PG <span style='color: #949494;'># … with 16 more rows</span> </CODE></PRE> --- # Define dm object Use `dm()` to create a dm object, pass tables (data frames or lazy tables). ```r base_dm <- dm( pixar_films = pixar_films_clean, academy = pixarfilms::academy, box_office = pixarfilms::box_office, ) base_dm ``` <PRE class="fansi fansi-output"><CODE><span style='color: #FFAFFF;'>──</span> <span style='color: #FFAFFF;'>Metadata</span> <span style='color: #FFAFFF;'>────────────────────────────────────────────────────────</span> Tables: `pixar_films`, `academy`, `box_office` Columns: 15 Primary keys: 0 Foreign keys: 0 </CODE></PRE> --- # Add keys Using `dm_add_pk()` and `dm_add_fk()`. .pull-left[ ```r full_dm <- base_dm %>% dm_add_pk(pixar_films, film) %>% dm_add_pk(box_office, film) %>% dm_add_fk(academy, film, pixar_films) %>% dm_add_fk(box_office, film, pixar_films) full_dm ``` <PRE class="fansi fansi-output"><CODE><span style='color: #FFAFFF;'>──</span> <span style='color: #FFAFFF;'>Metadata</span> <span style='color: #FFAFFF;'>────────────────────────────────────────────────────────</span> Tables: `pixar_films`, `academy`, `box_office` Columns: 15 Primary keys: 2 Foreign keys: 2 </CODE></PRE> ```r full_dm %>% dm_draw(view_type = "all") ``` ] .pull-right[ <!-- Generated by graphviz version 2.40.1 (20161225.0304) --> <!-- Title: %0 Pages: 1 --> <svg width="252pt" height="230pt" viewBox="0.00 0.00 252.00 230.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"> <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 226)"> <title>%0</title> <g id="a_graph0"><a xlink:title="Data Model"> <polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-226 248,-226 248,4 -4,4"/> </a> </g> <!-- academy --> <g id="node1" class="node"> <title>academy</title> <polygon fill="#efebdd" stroke="transparent" points="32.5,-201 32.5,-221 101.5,-221 101.5,-201 32.5,-201"/> <text text-anchor="start" x="42.1255" y="-206.4" font-family="Times,serif" font-size="14.00" fill="#000000">academy</text> <polygon fill="#ffffff" stroke="transparent" points="32.5,-181 32.5,-201 101.5,-201 101.5,-181 32.5,-181"/> <text text-anchor="start" x="34.5" y="-186.4" font-family="Times,serif" font-size="14.00" fill="#444444">film</text> <polygon fill="#ffffff" stroke="transparent" points="32.5,-161 32.5,-181 101.5,-181 101.5,-161 32.5,-161"/> <text text-anchor="start" x="34.3492" y="-166.4" font-family="Times,serif" font-size="14.00" fill="#444444">award_type</text> <polygon fill="#ffffff" stroke="transparent" points="32.5,-141 32.5,-161 101.5,-161 101.5,-141 32.5,-141"/> <text text-anchor="start" x="34.5" y="-146.4" font-family="Times,serif" font-size="14.00" fill="#444444">status</text> <polygon fill="none" stroke="#555555" points="31,-140 31,-222 102,-222 102,-140 31,-140"/> </g> <!-- pixar_films --> <g id="node3" class="node"> <title>pixar_films</title> <polygon fill="#efebdd" stroke="transparent" points="170.5,-171 170.5,-191 243.5,-191 243.5,-171 170.5,-171"/> <text text-anchor="start" x="174.7286" y="-176.4" font-family="Times,serif" font-size="14.00" fill="#000000">pixar_films</text> <polygon fill="#ffffff" stroke="transparent" points="170.5,-151 170.5,-171 243.5,-171 243.5,-151 170.5,-151"/> <text text-anchor="start" x="172.5" y="-156.4" font-family="Times,serif" font-size="14.00" fill="#444444">number</text> <polygon fill="#ffffff" stroke="transparent" points="170.5,-131 170.5,-151 243.5,-151 243.5,-131 170.5,-131"/> <text text-anchor="start" x="172.5" y="-137.4" font-family="Times,serif" text-decoration="underline" font-size="14.00" fill="#444444">film</text> <polygon fill="#ffffff" stroke="transparent" points="170.5,-111 170.5,-131 243.5,-131 243.5,-111 170.5,-111"/> <text text-anchor="start" x="172.5" y="-116.4" font-family="Times,serif" font-size="14.00" fill="#444444">franchise</text> <polygon fill="#ffffff" stroke="transparent" points="170.5,-91 170.5,-111 243.5,-111 243.5,-91 170.5,-91"/> <text text-anchor="start" x="172.5" y="-96.4" font-family="Times,serif" font-size="14.00" fill="#444444">sequel</text> <polygon fill="#ffffff" stroke="transparent" points="170.5,-71 170.5,-91 243.5,-91 243.5,-71 170.5,-71"/> <text text-anchor="start" x="172.413" y="-76.4" font-family="Times,serif" font-size="14.00" fill="#444444">release_date</text> <polygon fill="#ffffff" stroke="transparent" points="170.5,-51 170.5,-71 243.5,-71 243.5,-51 170.5,-51"/> <text text-anchor="start" x="172.5" y="-56.4" font-family="Times,serif" font-size="14.00" fill="#444444">run_time</text> <polygon fill="#ffffff" stroke="transparent" points="170.5,-31 170.5,-51 243.5,-51 243.5,-31 170.5,-31"/> <text text-anchor="start" x="172.5" y="-36.4" font-family="Times,serif" font-size="14.00" fill="#444444">film_rating</text> <polygon fill="none" stroke="#555555" points="169,-30 169,-192 244,-192 244,-30 169,-30"/> </g> <!-- academy->pixar_films --> <g id="edge1" class="edge"> <title>academy:film->pixar_films:film</title> <path fill="none" stroke="#555555" d="M101.5,-191C135.5254,-191 133.5378,-150.6405 160.2481,-142.4425"/> <polygon fill="#555555" stroke="#555555" points="161.0852,-145.8593 170.5,-141 160.1099,-138.9275 161.0852,-145.8593"/> </g> <!-- box_office --> <g id="node2" class="node"> <title>box_office</title> <polygon fill="#efebdd" stroke="transparent" points="1.5,-101 1.5,-121 132.5,-121 132.5,-101 1.5,-101"/> <text text-anchor="start" x="36.6795" y="-106.4" font-family="Times,serif" font-size="14.00" fill="#000000">box_office</text> <polygon fill="#ffffff" stroke="transparent" points="1.5,-81 1.5,-101 132.5,-101 132.5,-81 1.5,-81"/> <text text-anchor="start" x="3.5" y="-87.4" font-family="Times,serif" text-decoration="underline" font-size="14.00" fill="#444444">film</text> <polygon fill="#ffffff" stroke="transparent" points="1.5,-61 1.5,-81 132.5,-81 132.5,-61 1.5,-61"/> <text text-anchor="start" x="3.5" y="-66.4" font-family="Times,serif" font-size="14.00" fill="#444444">budget</text> <polygon fill="#ffffff" stroke="transparent" points="1.5,-41 1.5,-61 132.5,-61 132.5,-41 1.5,-41"/> <text text-anchor="start" x="3.5" y="-46.4" font-family="Times,serif" font-size="14.00" fill="#444444">box_office_us_canada</text> <polygon fill="#ffffff" stroke="transparent" points="1.5,-21 1.5,-41 132.5,-41 132.5,-21 1.5,-21"/> <text text-anchor="start" x="3.5" y="-26.4" font-family="Times,serif" font-size="14.00" fill="#444444">box_office_other</text> <polygon fill="#ffffff" stroke="transparent" points="1.5,-1 1.5,-21 132.5,-21 132.5,-1 1.5,-1"/> <text text-anchor="start" x="3.2447" y="-6.4" font-family="Times,serif" font-size="14.00" fill="#444444">box_office_worldwide</text> <polygon fill="none" stroke="#555555" points="0,0 0,-122 133,-122 133,0 0,0"/> </g> <!-- box_office->pixar_films --> <g id="edge2" class="edge"> <title>box_office:film->pixar_films:film</title> <path fill="none" stroke="#555555" d="M132.5,-91C156.2685,-91 146.8719,-127.2579 160.7946,-138.022"/> <polygon fill="#555555" stroke="#555555" points="159.9132,-141.4126 170.5,-141 161.9666,-134.7205 159.9132,-141.4126"/> </g> </g> </svg> ] --- # ETL, revisited All tables and keys in a dm object can be copied in one swoop. ```r db_path <- fs::path_abs("pixar.duckdb") fs::file_delete(db_path) con <- DBI::dbConnect(duckdb::duckdb(dbdir = db_path)) pixar_dm_duckdb <- copy_dm_to(con, full_dm, temporary = FALSE) ``` ``` Warning: duckdb doesn't support foreign keys, these won't be set in the remote database but are preserved in the `dm` ``` ```r pixar_dm_duckdb ``` <PRE class="fansi fansi-output"><CODE><span style='color: #00BB00;'>──</span> <span style='color: #00BB00;'>Table source</span> <span style='color: #00BB00;'>────────────────────────────────────────────────────</span> src: duckdb_connection <span style='color: #FFAFFF;'>──</span> <span style='color: #FFAFFF;'>Metadata</span> <span style='color: #FFAFFF;'>────────────────────────────────────────────────────────</span> Tables: `pixar_films`, `academy`, `box_office` Columns: 15 Primary keys: 2 Foreign keys: 2 </CODE></PRE> --- # ETL, revisited The `temporary = FALSE` argument ensures that tables are permanent. ```r pixar_dm_duckdb$pixar_films ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># Source: table<"pixar_films"> [?? x 7]</span> <span style='color: #949494;'># Database: duckdb_connection</span> <span style='font-weight: bold;'>number</span> <span style='font-weight: bold;'>film</span> <span style='font-weight: bold;'>franchise</span> <span style='font-weight: bold;'>sequel</span> <span style='font-weight: bold;'>release_date</span> <span style='font-weight: bold;'>run_time</span> <span style='font-weight: bold;'>film_rating</span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><date></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #BCBCBC;'> 1</span> 1 Toy Story Toy Story 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 2</span> 2 A Bug's Life A Bug's Life <span style='color: #BB0000;'>NA</span> 1998-11-25 95 G <span style='color: #BCBCBC;'> 3</span> 3 Toy Story 2 Toy Story 2 1999-11-24 92 G <span style='color: #BCBCBC;'> 4</span> 4 Monsters, Inc. Monsters, Inc. <span style='color: #BB0000;'>NA</span> 2001-11-02 92 G <span style='color: #BCBCBC;'> 5</span> 5 Finding Nemo Finding Nemo <span style='color: #BB0000;'>NA</span> 2003-05-30 100 G <span style='color: #BCBCBC;'> 6</span> 6 The Incredibles The Incredibles <span style='color: #BB0000;'>NA</span> 2004-11-05 115 PG <span style='color: #BCBCBC;'> 7</span> 7 Cars Cars 1 2006-06-09 117 G <span style='color: #BCBCBC;'> 8</span> 8 Ratatouille Ratatouille <span style='color: #BB0000;'>NA</span> 2007-06-29 111 G <span style='color: #BCBCBC;'> 9</span> 9 WALL-E WALL-E <span style='color: #BB0000;'>NA</span> 2008-06-27 98 G <span style='color: #BCBCBC;'>10</span> 10 Up Up <span style='color: #BB0000;'>NA</span> 2009-05-29 96 PG <span style='color: #949494;'># … with more rows</span> </CODE></PRE> --- # ETL, revisited Operations on the database dm work the same as for the local dm. ```r pixar_dm_duckdb %>% dm_flatten_to_tbl(academy) ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># Source: lazy query [?? x 9]</span> <span style='color: #949494;'># Database: duckdb_connection</span> <span style='font-weight: bold;'>film</span> <span style='font-weight: bold;'>award_type</span> <span style='font-weight: bold;'>status</span> <span style='font-weight: bold;'>number</span> <span style='font-weight: bold;'>franchise</span> <span style='font-weight: bold;'>sequel</span> <span style='font-weight: bold;'>release_date</span> <span style='font-weight: bold;'>run_time</span> <span style='font-weight: bold;'>film_rating</span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><date></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #BCBCBC;'> 1</span> Toy Story Animated Feature Award not yet introduced 1 Toy Story 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 2</span> Toy Story Original Screenplay Nominated 1 Toy Story 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 3</span> Toy Story Adapted Screenplay Ineligible 1 Toy Story 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 4</span> Toy Story Original Score Nominated 1 Toy Story 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 5</span> Toy Story Original Song Nominated 1 Toy Story 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 6</span> Toy Story Other Won Special Achievement 1 Toy Story 1 1995-11-22 81 G <span style='color: #BCBCBC;'> 7</span> A Bug's Life Animated Feature Award not yet introduced 2 A Bug's Life <span style='color: #BB0000;'>NA</span> 1998-11-25 95 G <span style='color: #BCBCBC;'> 8</span> A Bug's Life Adapted Screenplay Ineligible 2 A Bug's Life <span style='color: #BB0000;'>NA</span> 1998-11-25 95 G <span style='color: #BCBCBC;'> 9</span> A Bug's Life Original Score Nominated 2 A Bug's Life <span style='color: #BB0000;'>NA</span> 1998-11-25 95 G <span style='color: #BCBCBC;'>10</span> Toy Story 2 Animated Feature Award not yet introduced 3 Toy Story 2 1999-11-24 92 G <span style='color: #949494;'># … with more rows</span> </CODE></PRE> --- # Consume, revisited Keys are not (yet) loaded for DuckDB. This works better for SQL Server and Postgres. ```r DBI::dbDisconnect(con) con <- DBI::dbConnect(duckdb::duckdb(dbdir = db_path)) pixar_dm_learned <- dm_from_src(con) ``` ``` Keys could not be queried, use `learn_keys = FALSE` to mute this message. ``` ```r pixar_dm_learned ``` <PRE class="fansi fansi-output"><CODE><span style='color: #00BB00;'>──</span> <span style='color: #00BB00;'>Table source</span> <span style='color: #00BB00;'>───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────</span> src: duckdb_connection <span style='color: #FFAFFF;'>──</span> <span style='color: #FFAFFF;'>Metadata</span> <span style='color: #FFAFFF;'>───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────</span> Tables: `academy`, `box_office`, `pixar_films` Columns: 15 Primary keys: 0 Foreign keys: 0 </CODE></PRE> --- # Consume, revisited Implement a helper function for your data model. .pull-left[ ```r dm_pixarfilms_small <- function() { db_path <- fs::path_abs("pixar.duckdb") con <- DBI::dbConnect(duckdb::duckdb(dbdir = db_path)) table_names <- c("academy", "box_office", "pixar_films") dm_from_src( con, table_names = table_names, learn_keys = FALSE ) %>% dm_add_pk(pixar_films, film) %>% dm_add_pk(box_office, film) %>% dm_add_fk(academy, film, pixar_films) %>% dm_add_fk(box_office, film, pixar_films) } ``` ] .pull-right[ ```r dm_pixarfilms_small() ``` <PRE class="fansi fansi-output"><CODE><span style='color: #00BB00;'>──</span> <span style='color: #00BB00;'>Table source</span> <span style='color: #00BB00;'>────────────────────────────────────────────────────</span> src: duckdb_connection <span style='color: #FFAFFF;'>──</span> <span style='color: #FFAFFF;'>Metadata</span> <span style='color: #FFAFFF;'>────────────────────────────────────────────────────────</span> Tables: `academy`, `box_office`, `pixar_films` Columns: 15 Primary keys: 2 Foreign keys: 2 </CODE></PRE> ] --- background-image: url("data:image/png;base64,#images/magic-hat-and-wand-nath-r.svg") background-size: 25% background-position: 100% 100% # Consuming with dm Create a new derived table in the dm object. .pull-left[ ```r my_dm <- dm_pixarfilms_small() my_helper_dm <- my_dm %>% dm_zoom_to(academy) %>% filter(status == "Won") %>% count(film, name = "n_awards_won") %>% dm_insert_zoomed("academy_won") ``` ] .pull-right[ ```r my_helper_dm ``` <PRE class="fansi fansi-output"><CODE><span style='color: #00BB00;'>──</span> <span style='color: #00BB00;'>Table source</span> <span style='color: #00BB00;'>────────────────────────────────────────────────────</span> src: duckdb_connection <span style='color: #FFAFFF;'>──</span> <span style='color: #FFAFFF;'>Metadata</span> <span style='color: #FFAFFF;'>────────────────────────────────────────────────────────</span> Tables: `academy`, `box_office`, `pixar_films`, `academy_won` Columns: 17 Primary keys: 2 Foreign keys: 3 </CODE></PRE> ] --- background-image: url("data:image/png;base64,#images/magic-hat-and-wand-nath-r.svg") background-size: 25% background-position: 100% 100% # Consuming with dm Use the derived table to cross-reference box office and awards won per film. .pull-left[ ```r box_office_vs_awards <- my_helper_dm %>% dm_zoom_to(pixar_films) %>% left_join( box_office, select = c(film, box_office_worldwide) ) %>% left_join(academy_won) %>% transmute( rating = film_rating, box_office_mln = box_office_worldwide / 1e6, n_awards_won = coalesce(n_awards_won, 0) ) %>% pull_tbl() %>% collect() ``` ] .pull-right[ ```r box_office_vs_awards ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># A tibble: 26 × 3</span> <span style='font-weight: bold;'>rating</span> <span style='font-weight: bold;'>box_office_mln</span> <span style='font-weight: bold;'>n_awards_won</span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #BCBCBC;'> 1</span> G 632. 1 <span style='color: #BCBCBC;'> 2</span> G 871. 1 <span style='color: #BCBCBC;'> 3</span> PG 632. 2 <span style='color: #BCBCBC;'> 4</span> G 624. 1 <span style='color: #BCBCBC;'> 5</span> G 521. 1 <span style='color: #BCBCBC;'> 6</span> PG 735. 2 <span style='color: #BCBCBC;'> 7</span> G <span style='text-decoration: underline;'>1</span>067. 2 <span style='color: #BCBCBC;'> 8</span> PG 539. 1 <span style='color: #BCBCBC;'> 9</span> PG 858. 1 <span style='color: #BCBCBC;'>10</span> PG 807. 2 <span style='color: #949494;'># … with 16 more rows</span> </CODE></PRE> ] --- # Consuming with dm ```r ggplot(box_office_vs_awards, aes(x = box_office_mln, y = n_awards_won)) + geom_smooth() + geom_point() + facet_wrap(vars(rating)) + theme_bw(20) ``` <!-- --> --- background-image: url("data:image/png;base64,#images/24-frame.webp") background-size: 40% background-position: 100% 100% # The whole game: Exercises .pull-left[ - Experiment ] --- # Recap <table> <tr> <td rowspan=2> <img src="data:image/png;base64,#images/11-frame.webp" width="200px" /> </td> <td><img src="data:image/png;base64,#images/12-frame.webp" width="200px" /></td> <td><img src="data:image/png;base64,#images/12_2-frame.webp" width="200px" /></td> <td><img src="data:image/png;base64,#images/13-frame.webp" width="200px" /></td> <td><img src="data:image/png;base64,#images/14-frame.webp" width="200px" /></td> </tr> <tr style="background:transparent"> <td><img src="data:image/png;base64,#images/21-frame.webp" width="200px" /></td> <td><img src="data:image/png;base64,#images/22-frame.webp" width="200px" /></td> <td><img src="data:image/png;base64,#images/23-frame.webp" width="200px" /></td> <td><img src="data:image/png;base64,#images/24-frame.webp" width="200px" /></td> </tr> </table> --- # Schedule Extract ⇨ Transform ⇨ Load ⇨ Consume .pull-left[ ## One table - Read whole tables - Let the database do the heavy lifting - Basic ETL for one table - Subtle issues to watch out for ] .pull-right[ ## Multiple tables - Joins - The {dm} package - A bit of theory - Playing the whole game ] ## Bonus: Bring your own data + questions